In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import os
import graphlab as gl

In [2]:
# Text markers that stand for a missing reading in the raw data.
# NOTE(review): the list is not passed to read_csv here, so the load itself
# performs no missing-value substitution based on it.
na_values = ['-99900.0','-99901.0','-99903.0','999.0','nan']
# Read the training CSV from the relative "data" directory into an SFrame.
train = gl.SFrame.read_csv(os.path.join("data", "train_2013.csv"))


[INFO] Start server at: ipc:///tmp/graphlab_server-3802 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1428696993.log
[INFO] GraphLab Server Version: 1.3.0
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 100 lines in 11.8519 secs.
PROGRESS: Read 55681 lines. Lines per second: 26798.7
PROGRESS: Read 278243 lines. Lines per second: 37478.5
PROGRESS: Read 501654 lines. Lines per second: 39804.3
PROGRESS: Read 724871 lines. Lines per second: 40650.7
PROGRESS: Read 946730 lines. Lines per second: 40909.7
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 1126694 lines in 26.5083 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------

In [4]:
# Preview the first rows; most columns hold space-separated series of readings as text.
train.head()


Out[4]:
Id TimeToEnd DistanceToRadar Composite
1 56.0 37.0 31.0 25.0 19.0
13.0 7.0 2.0 ...
30.0 30.0 30.0 30.0 30.0
30.0 30.0 30.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
2 58.0 48.0 38.0 29.0 19.0
9.0 ...
77.0 77.0 77.0 77.0 77.0
77.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
3 59.0 20.0 75.0 75.0 -99900.0 -99900.0
4 53.0 43.0 34.0 24.0 14.0
5.0 ...
21.0 21.0 21.0 21.0 21.0
21.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
5 56.0 52.0 43.0 59.0 54.0
48.0 42.0 36.0 31.0 57.0 ...
69.0 69.0 69.0 83.0 83.0
83.0 83.0 83.0 83.0 54.0 ...
23.0 24.0 22.0 15.5 14.5
16.0 15.0 18.5 12.5 16.0 ...
6 56.0 47.0 37.0 27.0 18.0
8.0 ...
1.0 1.0 1.0 1.0 1.0 1.0 -99900.0 -99900.0
-99900.0 -4.0 -99900.0 ...
7 59.0 55.0 51.0 46.0 42.0
38.0 33.0 29.0 25.0 20.0 ...
42.0 42.0 42.0 42.0 42.0
42.0 42.0 42.0 42.0 42.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8 57.0 54.0 51.0 48.0 44.0
41.0 38.0 35.0 22.0 43.0 ...
10.0 10.0 10.0 10.0 10.0
10.0 10.0 10.0 10.0 8.0 ...
26.0 38.0 39.5 36.0 33.0
37.0 37.5 36.5 -99900.0 ...
9 36.0 26.0 92.0 92.0 -99900.0 -99900.0
10 15.0 5.0 53.0 43.0 33.0
14.0 9.0 3.0 ...
90.0 90.0 63.0 63.0 63.0
12.0 12.0 12.0 ...
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
HybridScan HydrometeorType Kdp RR1
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0 8.0 8.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13.5 15.5 19.0 -99900.0
-99900.0 -99900.0 ...
9.0 9.0 9.0 8.0 8.0 8.0
8.0 9.0 9.0 9.0 9.0 9.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 1.27899 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 8.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 -99900.0 ...
47.0 44.0 42.0 32.0 41.5
26.5 30.5 -99900.0 ...
9.0 13.0 13.0 13.0 9.0
9.0 13.0 9.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 6.048 4.66107 4.46988
3.07344 4.99969 4.3752 ...
-99900.0 -99900.0 8.0 8.0 0.0 0.0 0.0 0.0
11.0 13.5 -99900.0
-99900.0 -5.0 9.5 11.5 ...
8.0 9.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
RR2 RR3 RadarQualityIndex Reflectivity
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.006246 0.0200476
0.0113924 0.217157 ...
13.0 17.5 14.0 8.5 7.0
11.0 9.0 9.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
15.0 18.5 10.5 3.0 0.5
-3.0 ...
-99900.0 -99900.0 -99900.0 -99900.0 999.0 999.0 6.5 4.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0 0.0 0.0 0.0 0.0 0.0 11.0 14.0 12.0 11.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.0 1.0 1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.996433 0.0 ...
14.0 14.0 17.0 24.5 23.5
21.5 25.0 16.0 21.0 16.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
-13.5 -8.5 9.5 14.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0483593 0.0583249
0.103542 0.0419694 ...
15.5 16.0 3.5 -6.5 13.5
15.0 18.0 3.0 17.5 21.0 ...
-99900.0 9.72906 6.48322
6.28992 2.42506 4.78497 ...
-99900.0 -1.31778 18.6753
-6.69155 12.8562 -8.9 ...
1.0 1.0 1.0 1.0 1.0 1.0
1.0 0.948379 0.407035 ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 11.0 17.5 ...
-99900.0 -99900.0 -99900.0 -99900.0 999.0 999.0 17.0 5.5
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0789454 0.765964
0.178427 0.0386251 ...
13.0 12.0 4.5 16.0 2.5
9.5 11.5 12.0 ...
ReflectivityQC RhoHV Velocity Zdr
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.865 0.841667 0.765
0.985 0.768333 0.491667 ...
-99901.0 -99901.0
-99901.0 -99901.0 ...
7.9375 4.5 4.1875 5.5625
3.375 7.0625 5.3125 6 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.635 0.851667 0.891667
0.638333 0.791667 ...
-4.0 -3.0 -2.0 -0.5 -4.0
3.0 ...
2.6875 3.0 2.375 6.25
3.125 6.0625 ...
-99900.0 -99900.0 0.998333 0.891667 -99900.0 -3.5 -6.5 -4.6875
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.688333 0.518333
0.708333 0.805 0.708333 ...
-7.0 -12.0 -11.5 -8.5
-8.0 -13.0 ...
-0.375 5.0625 1.1875 2.0
2.0625 0.3125 ...
14.0 14.0 17.0 -99900.0
-99900.0 -99900.0 ...
1.01833 1.01167 0.991667
1.015 1.015 1.005 1.0 ...
14.0 13.5 12.5 -13.5
-19.5 -16.0 -15.0 -14.0 ...
0.9375 -0.875 -0.75 0.0
0.0625 0.3125 0.5625 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.05167 0.988333 0.298333
0.215 0.301667 0.235 ...
15.0 8.0 5.5 7.5 7.0 7.0 -0.9375 0.8125 3.1875
3.3125 -1.1875 1.6875 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.901667 0.661667
0.688333 0.465 0.845 ...
-2.0 13.5 -3.5 5.5 -4.5
3.0 -1.5 1.0 0.5 3.5 7.5 ...
7.5 -0.0625 5.5625
-4.5625 1.5 1.5625 2. ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 -99900.0 ...
0.958333 0.978333
0.988333 0.991667 0.995 ...
10.5 10.0 10.0 12.0 12.5
12.5 14.5 -99900.0 16.0 ...
0.375 -0.3125 0.5625
1.0625 1.5625 0.8125 ...
-99900.0 -99900.0 0.948333 0.641667 -10.0 -99900.0 0.5 -5.8125
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
0.971667 1.05167 0.915
0.208333 0.888333 1.0 ...
-12.0 -9.0 -18.5 -8.5
-99900.0 13.5 14.0 13.0 ...
2.375 0.0 0.25 4.1875
4.0625 0.625 0.875 -1.5 ...
LogWaterVolume MassWeightedMean MassWeightedSD Expected
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
nan nan nan nan nan nan 0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
-13.4793885769
-12.1370512402 ...
1.86413642918
1.27740873124 ...
0.755068594278
0.502681241559 ...
0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
0.0
-10.3712052779
-7.19622143405 ...
1.73805086725
1.51845864912 ...
0.653683396104
0.585207001984 ...
5.6
nan nan nan nan nan nan 0.0
-14.7984398705
-13.315794926 nan nan ...
2.54496998173
1.57016791355 nan nan ...
1.05628634415
0.639012953381 nan nan ...
0.0
[10 rows x 20 columns]

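Each feature cell holds a space-separated series of readings. Let's split each row and keep only the last valid (non-missing) reading.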


In [5]:
# Text tokens treated as missing readings. Matching is done on the exact
# spelling of the token, before any conversion to float.
# NOTE(review): duplicates the definition in the data-loading cell above.
na_values = ['-99900.0','-99901.0','-99903.0','999.0','nan']

In [6]:
def split_row(x, na_values=('-99900.0', '-99901.0', '-99903.0', '999.0', 'nan')):
    """Return the last valid reading of a whitespace-separated series.

    Parameters
    ----------
    x : str or None
        Whitespace-separated numeric tokens, e.g. ``"56.0 37.0 -99900.0"``.
    na_values : collection of str, optional
        Tokens that mark a missing reading. They are compared as text,
        before conversion to float. The default holds the sentinel
        strings used by this notebook, so the function no longer depends
        on a global defined in another cell.

    Returns
    -------
    float
        The right-most token that is not a missing marker, converted to
        float; ``np.nan`` when ``x`` is None, empty, or contains only
        missing markers.

    Raises
    ------
    ValueError
        If the token being examined cannot be parsed as a float.
    """
    if x is None:
        return np.nan
    # Scan from the right and stop at the first usable reading instead of
    # converting the whole series just to keep its last element.
    for token in reversed(x.split()):
        if token in na_values:
            continue
        value = float(token)
        if value != value:
            # NaN written with another spelling (e.g. "NaN") is still missing.
            continue
        return value
    return np.nan

In [7]:
# Column names of the raw training frame, in file order.
features = train.column_names()
# Parenthesised form prints the same list under Python 2 and also parses under Python 3.
print(features)


['Id', 'TimeToEnd', 'DistanceToRadar', 'Composite', 'HybridScan', 'HydrometeorType', 'Kdp', 'RR1', 'RR2', 'RR3', 'RadarQualityIndex', 'Reflectivity', 'ReflectivityQC', 'RhoHV', 'Velocity', 'Zdr', 'LogWaterVolume', 'MassWeightedMean', 'MassWeightedSD', 'Expected']

In [8]:
# Series-valued columns to reduce to one float each (every column except Id and Expected).
series_columns = [
    'TimeToEnd', 'DistanceToRadar', 'Composite', 'HybridScan',
    'HydrometeorType', 'Kdp', 'RR1', 'RR2', 'RR3', 'RadarQualityIndex',
    'Reflectivity', 'ReflectivityQC', 'RhoHV', 'Velocity', 'Zdr',
    'LogWaterVolume', 'MassWeightedMean', 'MassWeightedSD',
]
# Build a fresh frame column by column, applying split_row to every cell.
train_splitted = gl.SFrame()
for series_name in series_columns:
    reduced = train[series_name].apply(split_row)
    train_splitted[series_name] = reduced

In [10]:
# Copy the row identifier across unchanged; it is appended after the feature columns.
train_splitted["Id"] = train["Id"]

In [11]:
# Copy the raw target column across unchanged.
train_splitted["Expected"] = train["Expected"]

In [12]:
# Sanity check: every feature column should now hold a single float (or nan) per row.
train_splitted.head()


Out[12]:
TimeToEnd DistanceToRadar Composite HybridScan HydrometeorType Kdp RR1 RR2 RR3 RadarQualityIndex
2.0 30.0 nan nan 8.0 0.0 0.0 nan nan 0.147393
9.0 77.0 nan nan 8.0 0.0 0.0 nan nan nan
20.0 75.0 nan nan 8.0 0.0 0.0 nan nan nan
5.0 21.0 nan nan 8.0 0.0 0.0 nan nan 0.0
44.0 58.0 28.0 16.5 8.0 0.0 0.0 0.518951 -12.2587 1.0
8.0 1.0 -4.0 nan 8.0 0.0 0.0 nan nan nan
3.0 42.0 nan nan 8.0 0.0 0.0 nan nan 0.187933
49.0 20.0 23.5 17.5 9.0 0.0 0.0 6.79837 1.03084 0.407035
26.0 92.0 nan nan 8.0 0.0 0.0 nan nan nan
3.0 12.0 11.5 11.5 8.0 0.0 0.0 nan nan 0.855509
Reflectivity ReflectivityQC RhoHV Velocity Zdr LogWaterVolume MassWeightedMean MassWeightedSD
9.0 nan 1.05167 nan 6.125 nan nan nan
-3.0 nan 0.738333 3.0 6.0625 nan nan nan
4.0 nan 0.891667 -3.5 -4.6875 nan nan nan
15.5 nan 0.555 -13.0 0.3125 nan nan nan
14.5 3.5 1.005 -3.5 0.125 -12.4712061605 1.78860878333 0.697306959147
15.5 nan 0.235 7.0 1.6875 nan nan nan
-2.0 nan 0.811667 -2.0 3.375 nan nan nan
23.5 23.5 1.05167 16.0 -0.25 -7.62647279713 1.6704439729 0.63283388354
5.5 nan 0.641667 -10.0 -5.8125 nan nan nan
12.0 11.5 0.985 13.0 -1.5 -14.0125544939 1.85157078465 0.732113508508
Id Expected
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 5.6
9 0.0
10 0.0
[10 rows x 20 columns]


In [15]:
def bin_expected(x):
    """Map a target value to an integer bin label.

    Values of 69 or more all collapse into the single label 70; any
    smaller value is truncated toward zero with ``int``.
    """
    # NOTE(review): label 69 is never produced (68.x -> 68, 69 and up -> 70);
    # confirm the gap is intended.
    return 70 if x >= 69 else int(x)

In [25]:
# NOTE(review): this assigns the column to itself, so it looks like a no-op,
# possibly left over from an earlier experiment. Confirm and remove.
train_splitted["Expected"] = train_splitted["Expected"]

In [30]:
# Overwrite the raw target with its integer bin label from bin_expected.
train_splitted["Expected"] = train_splitted["Expected"].apply(bin_expected)

In [31]:
# Save train_splitted (last valid reading per feature, plus Id and binned Expected)
# to data/train_last.csv, going through pandas and omitting the index column.
train_splitted.to_dataframe().to_csv(os.path.join("data", "train_last.csv"), index=False)

In [32]:
# Read the test CSV from the relative "data" directory, the same way as the training set.
test = gl.SFrame.read_csv(os.path.join("data", "test_2014.csv"))


PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 100 lines in 9.42861 secs.
PROGRESS: Read 52033 lines. Lines per second: 26933
PROGRESS: Read 311201 lines. Lines per second: 43142.4
PROGRESS: Read 570544 lines. Lines per second: 44678.2
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 630452 lines in 13.1907 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------

In [33]:
# Same series-valued columns as for the training frame.
test_series_columns = [
    'TimeToEnd', 'DistanceToRadar', 'Composite', 'HybridScan',
    'HydrometeorType', 'Kdp', 'RR1', 'RR2', 'RR3', 'RadarQualityIndex',
    'Reflectivity', 'ReflectivityQC', 'RhoHV', 'Velocity', 'Zdr',
    'LogWaterVolume', 'MassWeightedMean', 'MassWeightedSD',
]
# Reduce each series column of the test frame with split_row, one column at a time.
test_splitted = gl.SFrame()
for series_name in test_series_columns:
    reduced = test[series_name].apply(split_row)
    test_splitted[series_name] = reduced

In [34]:
# Copy the row identifier across unchanged.
test_splitted["Id"] = test["Id"]

In [36]:
# Save the reduced test features (plus Id) to data/test_last.csv via pandas, omitting the index column.
test_splitted.to_dataframe().to_csv(os.path.join("data", "test_last.csv"), index=False)

In [ ]: